

<!DOCTYPE html>
<html class="writer-html5" lang="zh-CN" >
<head>
  <meta charset="utf-8" />
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  
  <title>mindspore.dataset.text.SentencePieceVocab &mdash; MindSpore master documentation</title>
  

  
  <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />

  
  

  
  

  

  
  <!--[if lt IE 9]>
    <script src="../../_static/js/html5shiv.min.js"></script>
  <![endif]-->
  
    
      <script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
        <script src="../../_static/jquery.js"></script>
        <script src="../../_static/underscore.js"></script>
        <script src="../../_static/doctools.js"></script>
        <script src="../../_static/language_data.js"></script>
        <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
    
    <script type="text/javascript" src="../../_static/js/theme.js"></script>

    
    <link rel="index" title="Index" href="../../genindex.html" />
    <link rel="search" title="Search" href="../../search.html" />
    <link rel="next" title="mindspore.dataset.text.SPieceTokenizerLoadType" href="mindspore.dataset.text.SPieceTokenizerLoadType.html" />
    <link rel="prev" title="mindspore.dataset.text.SentencePieceModel" href="mindspore.dataset.text.SentencePieceModel.html" /> 
</head>

<body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >
          

          
            <a href="../../index.html" class="icon icon-home"> MindSpore
          

          
          </a>

          
            
            
          

          
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

          
        </div>

        
        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          
            
            
              
            
            
              <p class="caption"><span class="caption-text">MindSpore Python API</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../mindspore.html">mindspore</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.common.initializer.html">mindspore.common.initializer</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.communication.html">mindspore.communication</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.compression.html">mindspore.compression</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.context.html">mindspore.context</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.dataset.html">mindspore.dataset</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.dataset.audio.html">mindspore.dataset.audio</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.dataset.config.html">mindspore.dataset.config</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../mindspore.dataset.text.html">mindspore.dataset.text</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../mindspore.dataset.text.html#mindspore-dataset-text-transforms">mindspore.dataset.text.transforms</a></li>
<li class="toctree-l2 current"><a class="reference internal" href="../mindspore.dataset.text.html#mindspore-dataset-text-utils">mindspore.dataset.text.utils</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="mindspore.dataset.text.JiebaMode.html">mindspore.dataset.text.JiebaMode</a></li>
<li class="toctree-l3"><a class="reference internal" href="mindspore.dataset.text.NormalizeForm.html">mindspore.dataset.text.NormalizeForm</a></li>
<li class="toctree-l3"><a class="reference internal" href="mindspore.dataset.text.SentencePieceModel.html">mindspore.dataset.text.SentencePieceModel</a></li>
<li class="toctree-l3 current"><a class="current reference internal" href="#">mindspore.dataset.text.SentencePieceVocab</a></li>
<li class="toctree-l3"><a class="reference internal" href="mindspore.dataset.text.SPieceTokenizerLoadType.html">mindspore.dataset.text.SPieceTokenizerLoadType</a></li>
<li class="toctree-l3"><a class="reference internal" href="mindspore.dataset.text.SPieceTokenizerOutType.html">mindspore.dataset.text.SPieceTokenizerOutType</a></li>
<li class="toctree-l3"><a class="reference internal" href="mindspore.dataset.text.to_str.html">mindspore.dataset.text.to_str</a></li>
<li class="toctree-l3"><a class="reference internal" href="mindspore.dataset.text.to_bytes.html">mindspore.dataset.text.to_bytes</a></li>
<li class="toctree-l3"><a class="reference internal" href="mindspore.dataset.text.Vocab.html">mindspore.dataset.text.Vocab</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.dataset.transforms.html">mindspore.dataset.transforms</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.dataset.vision.html">mindspore.dataset.vision</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.mindrecord.html">mindspore.mindrecord</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.nn.html">mindspore.nn</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.nn.probability.html">mindspore.nn.probability</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.nn.transformer.html">mindspore.nn.transformer</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.numpy.html">mindspore.numpy</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.ops.html">mindspore.ops</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.parallel.html">mindspore.parallel</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.parallel.nn.html">mindspore.parallel.nn</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.profiler.html">mindspore.profiler</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.scipy.html">mindspore.scipy</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.train.html">mindspore.train</a></li>
<li class="toctree-l1"><a class="reference internal" href="../mindspore.boost.html">mindspore.boost</a></li>
</ul>
<p class="caption"><span class="caption-text">MindSpore C++ API</span></p>
<ul>
<li class="toctree-l1"><a class="reference external" href="https://www.mindspore.cn/lite/api/zh-CN/master/api_cpp/mindspore.html">MindSpore Lite↗</a></li>
</ul>

            
          
        </div>
        
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">

      
      <nav class="wy-nav-top" aria-label="top navigation">
        
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../index.html">MindSpore</a>
        
      </nav>


      <div class="wy-nav-content">
        
        <div class="rst-content">
        
          

















<div role="navigation" aria-label="breadcrumbs navigation">

  <ul class="wy-breadcrumbs">
    
      <li><a href="../../index.html" class="icon icon-home"></a> &raquo;</li>
        
          <li><a href="../mindspore.dataset.text.html">mindspore.dataset.text</a> &raquo;</li>
        
      <li>mindspore.dataset.text.SentencePieceVocab</li>
    
    
      <li class="wy-breadcrumbs-aside">
        
          
            <a href="../../_sources/api_python/dataset_text/mindspore.dataset.text.SentencePieceVocab.rst.txt" rel="nofollow"> View page source</a>
          
        
      </li>
    
  </ul>

  
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
  <div class="section" id="mindspore-dataset-text-sentencepiecevocab">
<h1>mindspore.dataset.text.SentencePieceVocab<a class="headerlink" href="#mindspore-dataset-text-sentencepiecevocab" title="Permalink to this headline">¶</a></h1>
<dl class="class">
<dt id="mindspore.dataset.text.SentencePieceVocab">
<em class="property">class </em><code class="sig-prename descclassname">mindspore.dataset.text.</code><code class="sig-name descname">SentencePieceVocab</code><a class="headerlink" href="#mindspore.dataset.text.SentencePieceVocab" title="Permalink to this definition">¶</a></dt>
<dd><p>用于执行分词的SentencePiece对象。</p>
<dl class="method">
<dt id="mindspore.dataset.text.SentencePieceVocab.from_dataset">
<code class="sig-name descname">from_dataset</code><span class="sig-paren">(</span><em class="sig-param">dataset</em>, <em class="sig-param">col_names</em>, <em class="sig-param">vocab_size</em>, <em class="sig-param">character_coverage</em>, <em class="sig-param">model_type</em>, <em class="sig-param">params</em><span class="sig-paren">)</span><a class="headerlink" href="#mindspore.dataset.text.SentencePieceVocab.from_dataset" title="Permalink to this definition">¶</a></dt>
<dd><p>从数据集构建SentencePiece。</p>
<p><strong>参数：</strong></p>
<ul class="simple">
<li><p><strong>dataset</strong> (Dataset) - 表示用于构建SentencePiece对象的数据集。</p></li>
<li><p><strong>col_names</strong> (list) - 表示列名称的列表。</p></li>
<li><p><strong>vocab_size</strong> (int) - 表示词汇大小。</p></li>
<li><p><strong>character_coverage</strong> (float) - 表示模型涵盖的字符比例。推荐的默认值为：0.9995适用于具有丰富字符集的语言（如日文或中文），1.0适用于具有小字符集的其他语言。</p></li>
<li><p><strong>model_type</strong> (SentencePieceModel) - 其值可以是SentencePieceModel.UNIGRAM、SentencePieceModel.BPE、SentencePieceModel.CHAR或SentencePieceModel.WORD，默认值：SentencePieceModel.UNIGRAM。使用SentencePieceModel.WORD类型时，必须预先对输入句子进行分词。</p>
<ul>
<li><p>SentencePieceModel.UNIGRAM：Unigram语言模型意味着句子中的下一个单词被假定为独立于模型生成的前一个单词。</p></li>
<li><p>SentencePieceModel.BPE：指字节对编码算法，它将句子中出现最频繁的一对字节替换为一个未使用的字节。</p></li>
<li><p>SentencePieceModel.CHAR：指基于字符的SentencePiece模型类型。</p></li>
<li><p>SentencePieceModel.WORD：指基于单词的SentencePiece模型类型。</p></li>
</ul>
</li>
<li><p><strong>params</strong> (dict) - 表示没有传入参数的字典。</p></li>
</ul>
<p><strong>返回：</strong></p>
<p>SentencePieceVocab，从数据集构建的vocab。</p>
<p><strong>样例：</strong></p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">mindspore.dataset.text</span> <span class="kn">import</span> <span class="n">SentencePieceModel</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">TextFileDataset</span><span class="p">(</span><span class="s2">&quot;/path/to/sentence/piece/vocab/file&quot;</span><span class="p">,</span> <span class="n">shuffle</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vocab</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">SentencePieceVocab</span><span class="o">.</span><span class="n">from_dataset</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;text&quot;</span><span class="p">],</span> <span class="mi">5000</span><span class="p">,</span> <span class="mf">0.9995</span><span class="p">,</span>
<span class="gp">... </span>                                             <span class="n">SentencePieceModel</span><span class="o">.</span><span class="n">UNIGRAM</span><span class="p">,</span> <span class="p">{})</span>
</pre></div>
</div>
</dd></dl>

<dl class="method">
<dt id="mindspore.dataset.text.SentencePieceVocab.from_file">
<code class="sig-name descname">from_file</code><span class="sig-paren">(</span><em class="sig-param">file_path</em>, <em class="sig-param">vocab_size</em>, <em class="sig-param">character_coverage</em>, <em class="sig-param">model_type</em>, <em class="sig-param">params</em><span class="sig-paren">)</span><a class="headerlink" href="#mindspore.dataset.text.SentencePieceVocab.from_file" title="Permalink to this definition">¶</a></dt>
<dd><p>从单词列表中构建一个SentencePiece对象。</p>
<p><strong>参数：</strong></p>
<ul>
<li><p><strong>file_path</strong> (list) - 表示包含SentencePiece列表的文件的路径。</p></li>
<li><p><strong>vocab_size</strong> (int) - 表示词汇大小。</p></li>
<li><p><strong>character_coverage</strong> (float) - 表示模型涵盖的字符比例。推荐的默认值为：0.9995适用于具有丰富字符集的语言（如日文或中文），1.0适用于具有小字符集的其他语言。</p></li>
<li><p><strong>model_type</strong> (SentencePieceModel) - 其值可以是SentencePieceModel.UNIGRAM、SentencePieceModel.BPE、SentencePieceModel.CHAR或SentencePieceModel.WORD，默认值：SentencePieceModel.UNIGRAM。使用SentencePieceModel.WORD类型时，必须预先对输入句子进行分词。</p>
<ul class="simple">
<li><p>SentencePieceModel.UNIGRAM：Unigram语言模型意味着句子中的下一个单词被假定为独立于模型生成的前一个单词。</p></li>
<li><p>SentencePieceModel.BPE：指字节对编码算法，它将句子中出现最频繁的一对字节替换为一个未使用的字节。</p></li>
<li><p>SentencePieceModel.CHAR：指基于字符的SentencePiece模型类型。</p></li>
<li><p>SentencePieceModel.WORD：指基于单词的SentencePiece模型类型。</p></li>
</ul>
</li>
<li><p><strong>params</strong> (dict) - 表示没有传入参数的字典（参数派生自SentencePiece库）。</p>
<blockquote>
<div><div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">input_sentence_size</span> <span class="mi">0</span>
<span class="n">max_sentencepiece_length</span> <span class="mi">16</span>
</pre></div>
</div>
</div></blockquote>
</li>
</ul>
<p><strong>返回：</strong></p>
<p>SentencePieceVocab，表示从文件中构建的vocab。</p>
<p><strong>样例：</strong></p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">mindspore.dataset.text</span> <span class="kn">import</span> <span class="n">SentencePieceModel</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vocab</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">SentencePieceVocab</span><span class="o">.</span><span class="n">from_file</span><span class="p">([</span><span class="s2">&quot;/path/to/sentence/piece/vocab/file&quot;</span><span class="p">],</span> <span class="mi">5000</span><span class="p">,</span> <span class="mf">0.9995</span><span class="p">,</span>
<span class="gp">... </span>                                          <span class="n">SentencePieceModel</span><span class="o">.</span><span class="n">UNIGRAM</span><span class="p">,</span> <span class="p">{})</span>
</pre></div>
</div>
</dd></dl>

<dl class="method">
<dt id="mindspore.dataset.text.SentencePieceVocab.save_model">
<code class="sig-name descname">save_model</code><span class="sig-paren">(</span><em class="sig-param">vocab</em>, <em class="sig-param">path</em>, <em class="sig-param">filename</em><span class="sig-paren">)</span><a class="headerlink" href="#mindspore.dataset.text.SentencePieceVocab.save_model" title="Permalink to this definition">¶</a></dt>
<dd><p>将模型保存到给定的文件路径。</p>
<p><strong>参数：</strong></p>
<ul class="simple">
<li><p><strong>vocab</strong> (SentencePieceVocab) - 表示一个SentencePiece对象。</p></li>
<li><p><strong>path</strong> (str) - 表示存储模型的路径。</p></li>
<li><p><strong>filename</strong> (str) - 表示文件名称。</p></li>
</ul>
<p><strong>样例：</strong></p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">mindspore.dataset.text</span> <span class="kn">import</span> <span class="n">SentencePieceModel</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">vocab</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">SentencePieceVocab</span><span class="o">.</span><span class="n">from_file</span><span class="p">([</span><span class="s2">&quot;/path/to/sentence/piece/vocab/file&quot;</span><span class="p">],</span> <span class="mi">5000</span><span class="p">,</span> <span class="mf">0.9995</span><span class="p">,</span>
<span class="gp">... </span>                                          <span class="n">SentencePieceModel</span><span class="o">.</span><span class="n">UNIGRAM</span><span class="p">,</span> <span class="p">{})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">text</span><span class="o">.</span><span class="n">SentencePieceVocab</span><span class="o">.</span><span class="n">save_model</span><span class="p">(</span><span class="n">vocab</span><span class="p">,</span> <span class="s2">&quot;./&quot;</span><span class="p">,</span> <span class="s2">&quot;m.model&quot;</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>

</dd></dl>

</div>


           </div>
           
          </div>
          <footer>
    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
        <a href="mindspore.dataset.text.SPieceTokenizerLoadType.html" class="btn btn-neutral float-right" title="mindspore.dataset.text.SPieceTokenizerLoadType" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
        <a href="mindspore.dataset.text.SentencePieceModel.html" class="btn btn-neutral float-left" title="mindspore.dataset.text.SentencePieceModel" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
    </div>

  <hr/>

  <div role="contentinfo">
    <p>
        &#169; Copyright 2021, MindSpore.

    </p>
  </div>
    
    
    
    Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    
    provided by <a href="https://readthedocs.org">Read the Docs</a>. 

</footer>
        </div>
      </div>

    </section>

  </div>
  

  <script type="text/javascript">
      // Passing a function to jQuery() registers it as a DOM-ready handler, so
      // the theme navigation is only enabled after the document has been parsed.
      // NOTE(review): the meaning of the `true` argument is defined by the
      // theme script (not visible here) - presumably it turns on the sticky
      // sidebar navigation; confirm against _static/js/theme.js.
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script>

  
  
    
   

</body>
</html>